import os, re, shutil


# Folder whose entries are the article files to be classified.
sourcedir = 'C:/Users/tabi/Dropbox/BonillaLlloydMo/TextAnalysis/articles_newformat'

# Full path of every entry found directly inside sourcedir, in os.listdir order.
dirList = []
for entry_name in os.listdir(sourcedir):
    dirList.append(os.path.join(sourcedir, entry_name))

# Names accepted as US newspapers.  Each non-blank line of the list file
# contributes two spellings: the text before the first "(" (stripped) and the
# whole stripped line, so a title matches with or without a parenthesised suffix.
us_papers = []

# `with` guarantees the list file is closed; the original left the handle open
# and bound to `i`, a name the article loop later reuses as an integer.
with open("C:/Users/tabi/Dropbox/BonillaLlloydMo/TextAnalysis/USnewspaperList.txt", "r") as paper_list:
    for paper in paper_list:
        full_name = paper.strip()
        if not full_name:
            # A blank line would register "" as a newspaper, and any article
            # whose first line is only whitespace would then match it.
            continue
        us_papers.append(paper.split("(")[0].strip())
        us_papers.append(full_name)
     
        

inc = []        # title of every article whose first line matched us_papers
excluded = []   # distinct unmatched titles, in first-seen order
index = 0       # number of articles copied so far; suffix of the output name

_excluded_seen = set()                   # O(1) duplicate check backing `excluded`
_year_pattern = re.compile(r'\d\d\d\d')  # first run of four digits on a line


def _find_year(lines, title_idx):
    """Return the first four-digit run found near the title, or "" if none.

    Lines are probed in the order title+2, title+1, title+3, title+4, ...;
    positions past the end of the file are skipped instead of raising.
    """
    # NOTE(review): probing title+1 after title+2 reproduces the order the
    # original loop effectively used; it may have been accidental, but it is
    # kept so previously working files still get the same year.
    probe_order = [title_idx + 2, title_idx + 1]
    probe_order.extend(range(title_idx + 3, len(lines)))
    for pos in probe_order:
        if pos >= len(lines):
            continue
        found = _year_pattern.search(lines[pos])
        if found:
            return found.group()
    return ""


for article_path in dirList:
    # Read the whole article up front so the handle is closed immediately.
    with open(article_path, 'r') as article:
        lines = article.readlines()

    # Only the first line that is not exactly "\n" is examined; it is treated
    # as the newspaper title.
    title_idx = None
    for pos, line in enumerate(lines):
        if line != "\n":
            title_idx = pos
            break
    if title_idx is None:
        continue  # file is empty or contains only blank lines

    title = lines[title_idx].strip()

    if title not in us_papers:
        if title not in _excluded_seen:
            _excluded_seen.add(title)
            excluded.append(title)
        continue

    inc.append(title)

    year = _find_year(lines, title_idx)
    if not year:
        # Report the file and move on; the original raised IndexError here,
        # which aborted the run before the excluded list was written.
        print(article_path)
        continue

    c_file = str(year) + "_" + str(index)
    shutil.copyfile(article_path, os.path.join('articles_year', c_file))
    index = index + 1

    # TODO: unimplemented parsing sketch kept from the original notes
    # (line positions are the original author's guesses -- confirm against
    # real article files before implementing):
    #   date = lines[4]
    #   from lines[6] onward:
    #     - if the line has no ":", treat lines[6] as the subtitle
    #     - if the line has ":", split it on ":" into a field name and value
    #     - lines with no ":" are appended to the article text
# Write the unmatched titles, one per line, followed by a final newline
# (the same bytes `print >> out_put` produced).  `with` closes the file even
# if the write fails.
with open("C:/Users/tabi/Dropbox/BonillaLlloydMo/TextAnalysis/excluded_list.csv", 'w') as out_put:
    out_put.write("\n".join(excluded) + "\n")
    
